import pandas as pd
import os
os.chdir(r'C:/Users/HP/Desktop')
df = pd.read_csv("Travel details dataset.csv")
df.head()
| Trip ID | Destination | Start date | End date | Duration (days) | Traveler name | Traveler age | Traveler gender | Traveler nationality | Accommodation type | Accommodation cost | Transportation type | Transportation cost | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | London, UK | 5/1/2023 | 5/8/2023 | 7.0 | John Smith | 35.0 | Male | American | Hotel | 1200 | Flight | 600 |
| 1 | 2 | Phuket, Thailand | 6/15/2023 | 6/20/2023 | 5.0 | Jane Doe | 28.0 | Female | Canadian | Resort | 800 | Flight | 500 |
| 2 | 3 | Bali, Indonesia | 7/1/2023 | 7/8/2023 | 7.0 | David Lee | 45.0 | Male | Korean | Villa | 1000 | Flight | 700 |
| 3 | 4 | New York, USA | 8/15/2023 | 8/29/2023 | 14.0 | Sarah Johnson | 29.0 | Female | British | Hotel | 2000 | Flight | 1000 |
| 4 | 5 | Tokyo, Japan | 9/10/2023 | 9/17/2023 | 7.0 | Kim Nguyen | 26.0 | Female | Vietnamese | Airbnb | 700 | Train | 200 |
df.tail()
| Trip ID | Destination | Start date | End date | Duration (days) | Traveler name | Traveler age | Traveler gender | Traveler nationality | Accommodation type | Accommodation cost | Transportation type | Transportation cost | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 134 | 135 | Rio de Janeiro, Brazil | 8/1/2023 | 8/10/2023 | 9.0 | Jose Perez | 37.0 | Male | Brazilian | Hostel | 2500 | Car | 2000 |
| 135 | 136 | Vancouver, Canada | 8/15/2023 | 8/21/2023 | 6.0 | Emma Wilson | 29.0 | Female | Canadian | Hotel | 5000 | Airplane | 3000 |
| 136 | 137 | Bangkok, Thailand | 9/1/2023 | 9/8/2023 | 7.0 | Ryan Chen | 34.0 | Male | Chinese | Hostel | 2000 | Train | 1000 |
| 137 | 138 | Barcelona, Spain | 9/15/2023 | 9/22/2023 | 7.0 | Sofia Rodriguez | 25.0 | Female | Spanish | Airbnb | 6000 | Airplane | 2500 |
| 138 | 139 | Auckland, New Zealand | 10/1/2023 | 10/8/2023 | 7.0 | William Brown | 39.0 | Male | New Zealander | Hotel | 7000 | Train | 2500 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 139 entries, 0 to 138 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Trip ID 139 non-null int64 1 Destination 137 non-null object 2 Start date 137 non-null object 3 End date 137 non-null object 4 Duration (days) 137 non-null float64 5 Traveler name 137 non-null object 6 Traveler age 137 non-null float64 7 Traveler gender 137 non-null object 8 Traveler nationality 137 non-null object 9 Accommodation type 137 non-null object 10 Accommodation cost 137 non-null object 11 Transportation type 136 non-null object 12 Transportation cost 136 non-null object dtypes: float64(2), int64(1), object(10) memory usage: 14.2+ KB
df.describe()
| Trip ID | Duration (days) | Traveler age | |
|---|---|---|---|
| count | 139.000000 | 137.000000 | 137.000000 |
| mean | 70.000000 | 7.605839 | 33.175182 |
| std | 40.269923 | 1.601276 | 7.145441 |
| min | 1.000000 | 5.000000 | 20.000000 |
| 25% | 35.500000 | 7.000000 | 28.000000 |
| 50% | 70.000000 | 7.000000 | 31.000000 |
| 75% | 104.500000 | 8.000000 | 38.000000 |
| max | 139.000000 | 14.000000 | 60.000000 |
df.shape
(139, 13)
print(df.columns)
Index(['Trip ID', 'Destination', 'Start date', 'End date', 'Duration (days)',
'Traveler name', 'Traveler age', 'Traveler gender',
'Traveler nationality', 'Accommodation type', 'Accommodation cost',
'Transportation type', 'Transportation cost'],
dtype='object')
# Check for missing values
print(df.isnull().sum())
# You can either drop rows with missing values or fill them
df = df.dropna() # Drop rows with missing values
Trip ID 0 Destination 2 Start date 2 End date 2 Duration (days) 2 Traveler name 2 Traveler age 2 Traveler gender 2 Traveler nationality 2 Accommodation type 2 Accommodation cost 2 Transportation type 3 Transportation cost 3 dtype: int64
# Convert 'Start date' and 'End date' to datetime format
df['Start date'] = pd.to_datetime(df['Start date'], errors='coerce')
df['End date'] = pd.to_datetime(df['End date'], errors='coerce')
# Calculate trip duration based on dates (Duration in days)
df['Duration (days)'] = (df['End date'] - df['Start date']).dt.days
# Check the new dataset
print(df[['Start date', 'End date', 'Duration (days)']].head())
Start date End date Duration (days) 0 2023-05-01 2023-05-08 7 1 2023-06-15 2023-06-20 5 2 2023-07-01 2023-07-08 7 3 2023-08-15 2023-08-29 14 4 2023-09-10 2023-09-17 7
# Convert 'Start date' and 'End date' to datetime format
df['Start date'] = pd.to_datetime(df['Start date'], errors='coerce')
df['End date'] = pd.to_datetime(df['End date'], errors='coerce')
# Calculate trip duration based on dates (Duration in days)
df['Duration (days)'] = (df['End date'] - df['Start date']).dt.days
# Check the new dataset
print(df[['Start date', 'End date', 'Duration (days)']].head())
Start date End date Duration (days) 0 2023-05-01 2023-05-08 7 1 2023-06-15 2023-06-20 5 2 2023-07-01 2023-07-08 7 3 2023-08-15 2023-08-29 14 4 2023-09-10 2023-09-17 7
# Remove non-numeric characters (like '$', 'USD', etc.) from 'Accommodation cost' and 'Transportation cost'
df['Accommodation cost'] = df['Accommodation cost'].replace(r'[^\d.]', '', regex=True).astype(float)
df['Transportation cost'] = df['Transportation cost'].replace(r'[^\d.]', '', regex=True).astype(float)
# Fill any missing values with the median (if applicable)
df['Accommodation cost'].fillna(df['Accommodation cost'].median(), inplace=True)
df['Transportation cost'].fillna(df['Transportation cost'].median(), inplace=True)
# Check the cleaned columns
print(df[['Accommodation cost', 'Transportation cost']].head())
Accommodation cost Transportation cost 0 1200.0 600.0 1 800.0 500.0 2 1000.0 700.0 3 2000.0 1000.0 4 700.0 200.0
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Trip ID | 136.0 | 69.463235 | 40.388284 | 1.0 | 34.75 | 68.5 | 104.25 | 139.0 |
| Duration (days) | 136.0 | 7.536765 | 1.558290 | 5.0 | 7.00 | 7.0 | 8.00 | 14.0 |
| Traveler age | 136.0 | 33.117647 | 7.139933 | 20.0 | 28.00 | 31.0 | 37.25 | 60.0 |
| Accommodation cost | 136.0 | 1253.529412 | 1338.644162 | 150.0 | 600.00 | 900.0 | 1200.00 | 8000.0 |
| Transportation cost | 136.0 | 645.183824 | 584.476153 | 20.0 | 200.00 | 550.0 | 800.00 | 3000.0 |
df = df.dropna()
df = df.reset_index()
col_list = ["Transportation cost", "Accommodation cost"]
for col in col_list:
df[col] = df[col].astype(str)
df[col] = df[col].str.replace("$", "")
df[col] = df[col].str.replace(" USD", "")
df[col] = df[col].str.replace(",", "")
df[col] = pd.to_numeric(df[col])
C:\Users\HP\AppData\Local\Temp\ipykernel_2396\1584975083.py:5: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df["Transportation type"] = df["Transportation type"].astype(str)
group_accommodation_type = df.groupby("Accommodation type").agg({"Trip ID":"count", "Duration (days)":"mean", "Traveler age":"mean",
"Accommodation cost" : "mean", "Transportation type" : "max",
"Transportation cost" : "mean"}).reset_index()
group_nationality = df.groupby("Traveler nationality").agg({"Trip ID":"count", "Duration (days)":"mean", "Traveler age":"mean",
"Accommodation cost" : "mean", "Transportation type" : "max",
"Transportation cost" : "mean"}).reset_index()
group_transportation_type = df.groupby("Transportation type").agg({"Trip ID":"count", "Duration (days)":"mean", "Traveler age":"mean",
"Accommodation cost" : "mean","Transportation cost" : "mean"}).reset_index()
group_destination_nationality = df.groupby(["Destination", "Traveler nationality"]).agg({"Accommodation cost":"mean", "Trip ID": "count"}).reset_index()
group_destination_nationality = group_destination_nationality.sort_values("Accommodation cost", ascending = False)
import plotly.express as px
import plotly.graph_objs as go
import warnings
group_accommodation_type = group_accommodation_type.sort_values(by='Accommodation cost')
sorted_accommodation_types = group_accommodation_type['Accommodation type'].unique()
fig = px.bar(
group_accommodation_type,
y='Accommodation type',
x='Accommodation cost',
color='Accommodation cost',
orientation='h',
color_continuous_scale='blugrn'
)
fig.update_layout(
title_x=0.5,
title_font_size=20,
legend=dict(
font=dict(size=12),
bgcolor='rgba(255, 255, 255, 0.5)',
bordercolor='rgba(0, 0, 0, 0.5)',
borderwidth=2
),
margin=dict(
l=50,
r=50,
b=50,
t=80
))
fig.show()
import pandas as pd
import matplotlib.pyplot as plt
# take only head(5)
data = {
'Trip ID': [134, 135, 136, 137, 138],
'Destination': ['Rio de Janeiro, Brazil', 'Vancouver, Canada', 'Bangkok, Thailand', 'Barcelona, Spain', 'Auckland, New Zealand'],
'Start date': ['8/1/2023', '8/15/2023', '9/1/2023', '9/15/2023', '10/1/2023'],
'End date': ['8/10/2023', '8/21/2023', '9/8/2023', '9/22/2023', '10/8/2023'],
'Duration (days)': [9, 6, 7, 7, 7],
'Traveler name': ['Jose Perez', 'Emma Wilson', 'Ryan Chen', 'Sofia Rodriguez', 'William Brown'],
'Traveler age': [37, 29, 34, 25, 39],
'Traveler gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
'Traveler nationality': ['Brazilian', 'Canadian', 'Chinese', 'Spanish', 'New Zealander'],
'Accommodation type': ['Hostel', 'Hotel', 'Hostel', 'Airbnb', 'Hotel'],
'Accommodation cost': [2500, 5000, 2000, 6000, 7000],
'Transportation type': ['Car', 'Airplane', 'Train', 'Airplane', 'Train'],
'Transportation cost': [2000, 3000, 1000, 2500, 2500]
}
# Create DataFrame
df = pd.DataFrame(data)
# Calculate average transportation cost by transportation type
avg_transport_cost = df.groupby('Transportation type')['Transportation cost'].mean()
# Plotting
plt.figure(figsize=(10, 6))
avg_transport_cost.plot(kind='bar')
plt.title('Average Transportation Cost by Transportation Type')
plt.xlabel('Transportation Type')
plt.ylabel('Average Transportation Cost')
plt.xticks(rotation=45)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Sample data
data = {
'Trip ID': [134, 135, 136, 137, 138],
'Destination': ['Rio de Janeiro, Brazil', 'Vancouver, Canada', 'Bangkok, Thailand', 'Barcelona, Spain', 'Auckland, New Zealand'],
'Start date': ['8/1/2023', '8/15/2023', '9/1/2023', '9/15/2023', '10/1/2023'],
'End date': ['8/10/2023', '8/21/2023', '9/8/2023', '9/22/2023', '10/8/2023'],
'Duration (days)': [9, 6, 7, 7, 7],
'Traveler name': ['Jose Perez', 'Emma Wilson', 'Ryan Chen', 'Sofia Rodriguez', 'William Brown'],
'Traveler age': [37, 29, 34, 25, 39],
'Traveler gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
'Traveler nationality': ['Brazilian', 'Canadian', 'Chinese', 'Spanish', 'New Zealander'],
'Accommodation type': ['Hostel', 'Hotel', 'Hostel', 'Airbnb', 'Hotel'],
'Accommodation cost': [2500, 5000, 2000, 6000, 7000],
'Transportation type': ['Car', 'Airplane', 'Train', 'Airplane', 'Train'],
'Transportation cost': [2000, 3000, 1000, 2500, 2500]
}
# Create DataFrame
df = pd.DataFrame(data)
# Plotting the relationship between Duration (days) and Accommodation cost
plt.figure(figsize=(10, 6))
plt.scatter(df['Duration (days)'], df['Accommodation cost'])
plt.title('Relationship between Duration of Stay and Accommodation Cost')
plt.xlabel('Duration (days)')
plt.ylabel('Accommodation Cost')
plt.grid(True)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Sample data
data = {
'Trip ID': [134, 135, 136, 137, 138],
'Destination': ['Rio de Janeiro, Brazil', 'Vancouver, Canada', 'Bangkok, Thailand', 'Barcelona, Spain', 'Auckland, New Zealand'],
'Start date': ['8/1/2023', '8/15/2023', '9/1/2023', '9/15/2023', '10/1/2023'],
'End date': ['8/10/2023', '8/21/2023', '9/8/2023', '9/22/2023', '10/8/2023'],
'Duration (days)': [9, 6, 7, 7, 7],
'Traveler nationality': ['Brazilian', 'Canadian', 'Chinese', 'Spanish', 'New Zealander'],
'Accommodation type': ['Hostel', 'Hotel', 'Hostel', 'Airbnb', 'Hotel'],
'Accommodation cost': [2500, 5000, 2000, 6000, 7000],
'Transportation type': ['Car', 'Airplane', 'Train', 'Airplane', 'Train'],
'Transportation cost': [2000, 3000, 1000, 2500, 2500]
}
# Create DataFrame
df = pd.DataFrame(data)
# Calculate average duration by transportation type
avg_duration = df.groupby('Transportation type')['Duration (days)'].mean()
# Plotting
plt.figure(figsize=(10, 6))
avg_duration.plot(kind='bar', color=['skyblue', 'coral', 'limegreen'])
plt.title('Average Duration of Stay by Transportation Type')
plt.xlabel('Transportation Type')
plt.ylabel('Average Duration of Stay (days)')
plt.xticks(rotation=45)
plt.show()
# Calculate total travel cost
df['Total travel cost'] = df['Accommodation cost'] + df['Transportation cost']
print(df[['Trip ID', 'Destination', 'Total travel cost']])
Trip ID Destination Total travel cost 0 134 Rio de Janeiro, Brazil 4500 1 135 Vancouver, Canada 8000 2 136 Bangkok, Thailand 3000 3 137 Barcelona, Spain 8500 4 138 Auckland, New Zealand 9500
# Sample only for first 5 people
plt.figure(figsize=(12, 6))
plt.bar(df['Destination'], df['Total travel cost'], color='skyblue')
plt.title('Total Travel Cost by Destination')
plt.xlabel('Destination')
plt.ylabel('Total Travel Cost')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
plt.figure(figsize=(12, 6))
bar_width = 0.4
positions = range(len(df['Destination']))
plt.bar(positions, df['Accommodation cost'], bar_width, label='Accommodation Cost', color='skyblue')
plt.bar(positions, df['Transportation cost'], bar_width, bottom=df['Accommodation cost'], label='Transportation Cost', color='coral')
plt.title('Accommodation and Transportation Costs by Destination')
plt.xlabel('Destination')
plt.ylabel('Cost')
plt.xticks(positions, df['Destination'], rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Plot histograms of 'Accommodation cost' and 'Transportation cost'
df[['Accommodation cost', 'Transportation cost']].hist(bins=20, figsize=(10, 5))
plt.tight_layout()
plt.show()
# Boxplot to check for outliers
sns.boxplot(data=df[['Accommodation cost', 'Transportation cost']])
plt.show()